ATAC Postclustering
The following Rscript visualizes the ATAC clustering with plots of clusters, dendrogram, superclusters, etc.
github raw
library(lattice)
library(data.table)
source('plot.traces.R')
load('clusters.all.minc100.1e8.Rdata')
Generate plot.df object
plot.df = clusters.all.test.1e8$normalized
plot.df$sample.conditions = as.character(plot.df$sample.conditions)
plot.df$sample.conditions[plot.df$sample.conditions == 't0'] = 0
plot.df$sample.conditions[plot.df$sample.conditions == '20min'] = 20
plot.df$sample.conditions[plot.df$sample.conditions == '40min'] = 40
plot.df$sample.conditions[plot.df$sample.conditions == '60min'] = 60
plot.df$sample.conditions[plot.df$sample.conditions == '2hr'] = 120
plot.df$sample.conditions[plot.df$sample.conditions == '3hr'] = 180
plot.df$sample.conditions[plot.df$sample.conditions == '4hr'] = 240
plot.df$sample.conditions = as.numeric(plot.df$sample.conditions)
plot.df = plot.df[order(plot.df$genes),]
plot.df = plot.df[order(plot.df$sample.conditions),]
plot.df$cluster = paste('cluster', as.character(plot.df$cluster), sep = '')
plot.df$chr = sapply(strsplit(plot.df$genes, '[.]'), '[', 1)
plot.df$start = sapply(strsplit(plot.df$genes, '[.]'), '[', 2)
plot.df$end = sapply(strsplit(plot.df$genes, '[.]'), '[', 3)
save(plot.df,file='plot.df.Rdata')
write.table(cbind(plot.df$chr, plot.df$start, plot.df$end),
file = 'dynamic_peaks.bed', quote = FALSE,
sep = '\t', col.names=FALSE, row.names=FALSE)
Plot all clusters
for (i in unique(plot.df$cluster)) {
print(i)
write.table(plot.df[plot.df$cluster == i,
c('chr','start','end', 'value', 'cluster')][
!duplicated(plot.df[plot.df$cluster == i,]$genes),],
file = paste0('cluster_bed_',
gsub(" ", "", i, fixed = TRUE),'.bed'),
quote = FALSE, row.names = FALSE, col.names = FALSE, sep = '\t')
}
pdf('atac_clusters.pdf', width=11, height=15)
trellis.par.set(box.umbrella = list(lty = 1, col="black", lwd=1),
box.rectangle = list( lwd=1.0, col="black", alpha = 1.0),
plot.symbol = list(col="black", lwd=1.0, pch ='.'))
print(
xyplot(value ~ sample.conditions | cluster, group = genes, data = plot.df,
type = c('l'),#type = c('l','p'),
scales=list(x=list(cex=1.0,relation = "free", rot = 45),
y =list(cex=1.0, relation="free")),
aspect=1.0,
between=list(y=0.5, x=0.5),
ylab = list(label = 'Normalized ATAC signal', cex =1.0),
xlab = list(label = 'Time (minutes)', cex =1.0),
par.settings = list(superpose.symbol = list(pch = c(16),
col=c('grey20'), cex =0.5),
strip.background=list(col="grey80"),
superpose.line = list(col = c('#99999980'), lwd=c(1),
lty = c(1))),
panel = function(x, y, ...) {
panel.xyplot(x, y, ...)
panel.bwplot(x, y, pch = '|', horizontal = FALSE, box.width = 15,
do.out = FALSE)
panel.spline(x, y, col = 'blue', lwd =2.0, ...)
})
)
dev.off()
#up.flat - 9,23,17,11
#grad.up - 5,8,10,1
#up.down - 6,2,12
#grad.down - 7,3,4,13,14
#down.up - 24
Plot dendrogram
x = as.data.table(plot.df)
plot.df.cluster = dcast(x, genes + cluster ~ sample.conditions, value.var="value")
avg.clusters = as.data.frame(matrix(nrow = 0, ncol = 7))
for (i in unique(plot.df.cluster$cluster)) {
z = data.frame(matrix(colMeans(plot.df.cluster[plot.df.cluster$cluster == i,3:9]),
ncol = 7, nrow = 1))
rownames(z) = c(i)
colnames(z) = as.character(colnames(plot.df.cluster)[3:9])
avg.clusters = rbind(avg.clusters, z)
}
dd = dist(avg.clusters)
hc = hclust(dd, method = "complete")
pdf('dendrogram.pdf', width=8, height=5)
plot(hc, xlab = "Clusters", main = ' ', hang = -1)
abline(h = 2, lty = 2)
dev.off()
Plot clusters organized by supercluster
df = data.frame(index=1:17,cluster=unique(plot.df$cluster)[order(unique(plot.df$cluster))])
df$cluster.num = as.integer(sapply(strsplit(df$cluster, 'cluster'), '[', 2))
df = df[order(df$cluster.num),]
df = df[reorder(df$cluster.num,c(23,17,9,11,5,8,1,10,12,2,6,24,14,13,7,4,3)),]
pdf('atac_clusters_org_by_sc.pdf', width=11, height=15)
trellis.par.set(box.umbrella = list(lty = 1, col="black", lwd=1),
box.rectangle = list( lwd=1.0, col="black", alpha = 1.0),
plot.symbol = list(col="black", lwd=1.0, pch ='.'))
print(
xyplot(value ~ sample.conditions | cluster, group = genes, data = plot.df,
type = c('l'),#type = c('l','p'),
scales=list(x=list(cex=1.0,relation = "free", rot = 45),
y =list(cex=1.0, relation="free")),
aspect=1.0,
layout = c(5,5),
between=list(y=0.5, x=0.5),
index.cond=list(rev(df$index)),
skip = c(F,F,F,F,F,
F,T,T,T,T,
F,F,F,T,T,
F,F,F,F,T,
F,F,F,F,T),
ylab = list(label = 'Normalized ATAC signal', cex =1.0),
xlab = list(label = 'Time (minutes)', cex =1.0),
par.settings = list(superpose.symbol = list(pch = c(16),
col=c('grey20'), cex =0.5),
strip.background=list(col="grey80"),
superpose.line = list(col = c('#99999980'), lwd=c(1),
lty = c(1))),
panel = function(x, y, ...) {
panel.xyplot(x, y, ...)
panel.bwplot(x, y, pch = '|', horizontal = FALSE, box.width = 15,
do.out = FALSE)
panel.spline(x, y, col = 'blue', lwd =2.0, ...)
})
)
dev.off()
Plot supercluster traces
gradual.down = plot.df[plot.df$cluster == 'cluster7' |
plot.df$cluster == 'cluster3' |
plot.df$cluster == 'cluster4' |
plot.df$cluster == 'cluster13' |
plot.df$cluster == 'cluster14',]
down.up = plot.df[plot.df$cluster == 'cluster24',]
gradual.up = plot.df[plot.df$cluster == 'cluster5' |
plot.df$cluster == 'cluster8' |
plot.df$cluster == 'cluster10' |
plot.df$cluster == 'cluster1',]
up.flat = plot.df[plot.df$cluster == 'cluster9' |
plot.df$cluster == 'cluster23' |
plot.df$cluster == 'cluster17' |
plot.df$cluster == 'cluster11',]
up.down = plot.df[plot.df$cluster == 'cluster6' |
plot.df$cluster == 'cluster12' |
plot.df$cluster == 'cluster2',]
gradual.down$supercluster = 'gradual.down'
down.up$supercluster = 'down.up'
gradual.up$supercluster = 'gradual.up'
up.flat$supercluster = 'up.flat'
up.down$supercluster = 'up.down'
nrow(gradual.down)/7
nrow(down.up)/7
nrow(gradual.up)/7
nrow(up.flat)/7
nrow(up.down)/7
plot.df.atac = rbind(gradual.down,
down.up,
gradual.up,
up.flat,
up.down)
plot.df.atac = plot.df.atac[,-(7:26)]
plot.df.atac$genes = paste0(plot.df.atac$chr,':',plot.df.atac$start,'-',plot.df.atac$end)
save(plot.df.atac,file='plot.df.atac.Rdata')
pdf('atac_superclusters.pdf', width=6.83, height=5)
trellis.par.set(box.umbrella = list(lty = 1, col="black", lwd=1),
box.rectangle = list( lwd=1.0, col="black", alpha = 1.0),
plot.symbol = list(col="black", lwd=1.0, pch ='.'))
print(
xyplot(value ~ sample.conditions | supercluster, group = genes,
data = plot.df.atac, type = c('l'),#type = c('l','p'),
scales=list(x=list(cex=1.0,relation = "free", rot = 45),
y =list(cex=1.0, relation="free")),
aspect=1.0,
between=list(y=0.5, x=0.5),
layout = c(5,1),
ylab = list(label = 'Normalized ATAC signal', cex =1.0),
xlab = list(label = 'Time (minutes)', cex =1.0),
par.settings = list(superpose.symbol = list(pch = c(16),
col=c('grey20'), cex =0.5),
strip.background=list(col="grey80"),
superpose.line = list(col = c('#99999980'), lwd=c(1),
lty = c(1))),
panel = function(x, y, ...) {
panel.xyplot(x, y, ...)
panel.bwplot(x, y, pch = '|', horizontal = FALSE, box.width = 15,
do.out = FALSE)
panel.spline(x, y, col = 'blue', lwd =2.0, ...)
})
)
dev.off()
Plot individual traces
df = data.frame(sc = c("down.up","gradual.down","gradual.up","up.down","up.flat"),
col = c('orange1','#4169E1','#FF0000','green2','#A020F0'))
for(sc in unique(plot.df.atac$supercluster)) {
col = df[df$sc == sc,]$col
y = plot.df.atac[plot.df.atac$supercluster == sc,]
pdf(file=paste0(sc,'.traces.pdf'))
trellis.par.set(box.umbrella = list(lty = 1, col="black", lwd=1),
box.rectangle = list( lwd=1.0, col="black", alpha = 1.0),
plot.symbol = list(col="black", lwd=1.0, pch ='.'))
print(
xyplot(value ~ sample.conditions, group = genes, data = y, type = c('l'),
scales=list(x=list(cex=1.0,relation = "free", rot = 45),
y = list(cex=1.0, relation="free")),
aspect=1.0,
between=list(y=0.5, x=0.5),
main = list(label = paste0(sc, ' traces'), cex = 1.5),
ylab = list(label = 'Normalized ATAC signal', cex =1.0),
xlab = list(label = 'Time (minutes)', cex =1.0),
par.settings = list(superpose.symbol = list(pch = c(16),
col=c('grey20'), cex =0.5),
strip.background = list(col="grey80"),
superpose.line = list(col = c('#99999980'),
lwd=c(1),lty = c(1))),
panel = function(x, y, ...) {
panel.xyplot(x, y, ...)
panel.bwplot(x, y, pch = '|', horizontal = FALSE, box.width = 15,
do.out = FALSE)
panel.spline(x, y, col = 'blue', lwd = 3.5, ...)
#replace col = 'blue' with col = col for different sc colors
})
)
dev.off()
}
Generating .bed file for each cluster
We will generate .bed file for each cluster in order to perform motif enrichment analyses.
for (i in unique(plot.df.atac$supercluster)) {
print(i)
write.table(plot.df.atac[plot.df.atac$supercluster == i,
c('chr','start','end', 'value', 'supercluster')][
!duplicated(plot.df.atac[plot.df.atac$supercluster == i,]$genes),],
file = paste0(i,'.bed'),
quote = FALSE, row.names = FALSE, col.names = FALSE, sep = '\t')
}
Generating comprehensive ATAC dataframe
Rerun analysis to generate final ATAC dataframe containing all relevant features.
github raw
library(DESeq2)
categorize.deseq.df <- function(df, fdr = 0.05, log2fold = 0.0, treat
= 'Auxin') {
df.effects.lattice = df
df.effects.lattice$response = 'All Other Peaks'
if(nrow(df.effects.lattice[df.effects.lattice$padj < fdr &
!is.na(df.effects.lattice$padj) &
df.effects.lattice$log2FoldChange > log2fold,]) > 0) {
df.effects.lattice[df.effects.lattice$padj < fdr &
!is.na(df.effects.lattice$padj) &
df.effects.lattice$log2FoldChange > log2fold,]$response = 'Increased'
}
if(nrow(df.effects.lattice[df.effects.lattice$padj < fdr &
!is.na(df.effects.lattice$padj) &
df.effects.lattice$log2FoldChange < log2fold,]) > 0) {
df.effects.lattice[df.effects.lattice$padj < fdr &
!is.na(df.effects.lattice$padj) &
df.effects.lattice$log2FoldChange < log2fold,]$response = 'Decreased'
}
if(nrow(df.effects.lattice[df.effects.lattice$padj > 0.5 &
!is.na(df.effects.lattice$padj) &
abs(df.effects.lattice$log2FoldChange) < 0.25,]) > 0) {
df.effects.lattice[df.effects.lattice$padj > 0.5 &
!is.na(df.effects.lattice$padj) &
abs(df.effects.lattice$log2FoldChange) < 0.25,]$response = 'Unchanged'
}
return(df.effects.lattice)
}
setwd("/scratch/bhn9by/ATAC")
peaks = unique(read.table('all_peaks.bed',sep='\t'))
peaks$location = paste0(peaks$V1,':',peaks$V2,'-',peaks$V3)
df = data.frame(row.names = peaks$location,chr=peaks[,1],start=peaks[,2],end=peaks[,3])
Add counts
load('normalized.counts.atac.Rdata')
zero.min = c()
twenty.min = c()
forty.min = c()
sixty.min = c()
onetwenty.min = c()
oneeighty.min = c()
twoforty.min = c()
genes = c()
print('Getting ATAC means')
for (i in 1:nrow(normalized.counts.atac)) {
zero.min = append(zero.min, mean(normalized.counts.atac[i,19:21]))
twenty.min = append(twenty.min, mean(normalized.counts.atac[i,1:3]))
forty.min = append(forty.min, mean(normalized.counts.atac[i,10:12]))
sixty.min = append(sixty.min, mean(normalized.counts.atac[i,16:18]))
onetwenty.min = append(onetwenty.min,mean(normalized.counts.atac[i,4:6]))
oneeighty.min = append(oneeighty.min, mean(normalized.counts.atac[i,7:9]))
twoforty.min = append(twoforty.min, mean(normalized.counts.atac[i,13:15]))
genes = append(genes,rownames(normalized.counts.atac)[i])
}
atac.means = data.frame(row.names = genes, min.0 = zero.min, min.20 = twenty.min,
min.40 = forty.min, min.60 = sixty.min, min.120 = onetwenty.min,
min.180 = oneeighty.min, min.240 = twoforty.min)
save(atac.means,file='atac.means.Rdata')
df = merge(df,atac.means,by='row.names',all=TRUE)
rownames(df) = df[,1]
df = df[,-1]
Add cluster information
print('Adding Cluster Info')
print(head(df))
#up.flat - 9,23,17,11
#grad.up - 5,8,10,1
#up.down - 6,2,12
#grad.down - 7,3,4,13,14
#down.up - 24
supercluster.df = data.frame(cluster=c(9,23,17,11,5,8,10,1,6,2,12,7,3,4,13,14,24),
supercluster=c(rep('up.flat',4),
rep('grad.up',4),
rep('up.down',3),
rep('grad.down',5),
rep('down.up',1)))
cluster.df = data.frame()
for (file in Sys.glob(file.path(paste0('cluster_bed_cluster*.bed')))) {
cluster = as.numeric(strsplit(strsplit(file,'cluster_bed_cluster')[[1]][2],'.bed')[[1]][1])
print(cluster)
sc = supercluster.df[supercluster.df$cluster == cluster,]$supercluster
x = read.table(file)
x$location = paste0(x$V1,':',x$V2,'-',x$V3)
cluster.df = rbind(cluster.df,data.frame(peak=x$location,cluster=cluster,supercluster=sc))
}
df = merge(df,cluster.df,by.x='row.names',by.y=1,all.x=TRUE)
rownames(df) = df[,1]
df = df[,-1]
Add transcription factor family scores
print('Adding TF Family Scores')
print(head(df))
scores.df = data.frame()
for (file in Sys.glob('fimo_composites/*_fimo_all.bed')) {
factor = strsplit(strsplit(file,'/')[[1]][2],'_fimo_all.bed')[[1]][1]
print(factor)
x = read.table(file,sep='\t')
x = x[x$V5 != -1,]
if(factor %in% c('SP','KLF')) {
y = aggregate(as.numeric(V7)~V1+V2+V3, data=x, FUN=sum)
} else {
y = aggregate(as.numeric(V8)~V1+V2+V3, data=x, FUN=sum)
}
colnames(y) = c('chr', 'start', 'end', factor)
rownames(y) = paste0(y[,1], ':', y[,2], '-', y[,3])
scores.df = rbind(scores.df,data.frame(peak = rownames(y),score=y[,4],factor = factor))
}
fimo.scores.all.atac = data.frame(peak = unique(scores.df$peak))
for(factor in unique(scores.df$factor)) {
temp = scores.df[scores.df$factor == factor,]
temp = temp[,c(1,2)]
fimo.scores.all.atac = merge(fimo.scores.all.atac,temp,by='peak',all.x=TRUE)
colnames(fimo.scores.all.atac)[ncol(fimo.scores.all.atac)] = factor
}
rownames(fimo.scores.all.atac) = fimo.scores.all.atac$peak
fimo.scores.all.atac = fimo.scores.all.atac[,-1]
save(fimo.scores.all.atac,file='fimo.scores.all.atac.Rdata')
df = merge(df,fimo.scores.all.atac,by = 'row.names',all = TRUE)
rownames(df) = df$Row.names
df = df[,-1]
Add pairwise comparisons
print('Adding Pairwise Comparisons')
print(head(df))
load('df.preadipo.Rdata')
time.pts.key = data.frame(time.pts=c(rep(20,3),rep(120,3),rep(180,3),rep(40,3),
rep(240,3),rep(60,3),rep(0,3)),
cols = c(1:21))
time.pts = c(0,20,40,60,120,180,240)
comparisons.df = data.frame()
for (i in 1:(length(time.pts)-1)) {
for (j in (i+1):length(time.pts)) {
print(paste0(time.pts[j],'.v.',time.pts[i]))
a = time.pts.key[time.pts.key$time.pts == time.pts[i],]$cols
b = time.pts.key[time.pts.key$time.pts == time.pts[j],]$cols
merged.counts.small = df.preadipo[,c(a,b)]
# number of replicates per condition
unt = 3
trt = 3
sample.conditions = factor(c(rep("untreated",unt), rep("treated",trt)),
levels=c("untreated","treated"))
mm.deseq.counts.table = DESeqDataSetFromMatrix(merged.counts.small,
DataFrame(sample.conditions), ~ sample.conditions)
mm.atac = mm.deseq.counts.table
atac.size.factors = estimateSizeFactorsForMatrix(merged.counts.small)
sizeFactors(mm.atac) = atac.size.factors
mm.atac = estimateDispersions(mm.atac)
mm.atac = nbinomWaldTest(mm.atac)
res.mm.atac = results(mm.atac)
lattice = categorize.deseq.df(res.mm.atac, fdr = 0.001, log2fold = 0.0, treat = '')
lattice = as.data.frame(lattice[,c(2,6,7)])
colnames(lattice) = paste0(colnames(lattice),'.',time.pts[j],'.v.',time.pts[i])
comparisons.df = merge(comparisons.df,lattice,by='row.names',all=TRUE)
rownames(comparisons.df) = comparisons.df$Row.names
comparisons.df = comparisons.df[,-1]
}
}
save(comparisons.df,file='comparisons.df.Rdata')
df = merge(df,comparisons.df,by = 'row.names',all = TRUE)
rownames(df) = df$Row.names
df = df[,-1]
Add baseMean and overall time course info
print('Add baseMean and overall time course info')
load('res.lrt.Rdata')
res.lrt = as.data.frame(res.lrt[,c(1,2,6)])
res.lrt$response = 'Nondynamic'
res.lrt[res.lrt$padj < 0.00000001 & !is.na(res.lrt$padj),]$response = 'Dynamic'
colnames(res.lrt) = paste0(colnames(res.lrt),'.time.course')
df = merge(df,res.lrt,by = 'row.names',all = TRUE)
rownames(df) = paste0(df$chr,':',df$start,'-',df$end)
df = df[,-1]
Add distribution
print('Add distribution')
df$distribution = 'Intergenic'
x = read.table('all_ATAC_peaks_intragenic.bed')
x$location = paste0(x$V1,':',x$V2,'-',x$V3)
df[rownames(df) %in% x$location,]$distribution = 'Intragenic'
x = read.table('all_ATAC_peaks_promoters.bed')
x$location = paste0(x$V1,':',x$V2,'-',x$V3)
df[rownames(df) %in% x$location,]$distribution = 'Promoter'
final.atac.all.df=df
save(final.atac.all.df,file='final.atac.all.df.Rdata')
Figure 1
Generate 'fimo.scores.atac.Rdata' object
The following Rscript generates plots showing that distinct transcription factors drive opening and closing of chromatin in early adipogenesis.
github raw
library(ggplot2)
library(lattice)
library(pheatmap)
library(fabricatr)
library(ggseqlogo)
load('/scratch/bhn9by/ATAC/plot.df.atac.Rdata')
dir = '/scratch/bhn9by/ATAC/fimo_composites/'
setwd(dir)
#generate 'fimo.scores.atac.Rdata' object
res = unique(read.table('/scratch/bhn9by/ATAC/dynamic_peaks.bed'))
colnames(res) = c('chr', 'start', 'end')
res$start = as.numeric(as.character(res$start))
res$end = as.numeric(as.character(res$end))
rownames(res) = paste0(res[,1], ':', res[,2], '-', res[,3])
#main figure families
for(bed.file in Sys.glob(file.path(paste0(dir,'main_figure_beds/*_fimo.bed')))) {
factor.name = strsplit(bed.file, "/")[[1]]
factor.name = strsplit(factor.name[length(factor.name)],
'_fimo.bed')[[1]][1]
print(factor.name)
x = read.table(bed.file, stringsAsFactors=FALSE)
x = x[x[,6] != -1,]
y = aggregate(as.numeric(V8)~V1+V2+V3, data=x, FUN=sum)
colnames(y) = c('chr', 'start', 'end', factor.name)
rownames(y) = paste0(y[,1], ':', y[,2], '-', y[,3])
func <- function(peak) {
if(peak %in% rownames(y)) {
return(y[rownames(y) == peak,ncol(y)])
} else {
return(NA)
}
}
res[,ncol(res)+1] = sapply(rownames(res),func)
colnames(res)[ncol(res)] = factor.name
}
res = res[,-c(1:3)]
#add in SP and KLF after separation
load('/scratch/bhn9by/ATAC/SP_KLF_split/sp.klf.scores.atac.Rdata')
x = sp.klf.scores.atac[,4:5]
colnames(x) = c('SP','KLF')
res = merge(res,x,by='row.names',all=TRUE)
rownames(res) = res[,1]
res = res[,-1]
fimo.scores.atac = res
save(fimo.scores.atac, file = '/scratch/bhn9by/ATAC/fimo.scores.atac.Rdata')
#save as bed files
for(i in 1:ncol(fimo.scores.atac)) {
temp = fimo.scores.atac[!is.na(fimo.scores.atac[,i]),]
chr = sapply(strsplit(rownames(temp), ':'), '[', 1)
x = sapply(strsplit(rownames(temp), ':'), '[', 2)
start = sapply(strsplit(x, '-'), '[', 1)
end = sapply(strsplit(x, '-'), '[', 2)
bed = data.frame(chr, start, end, score = temp[,i])
write.table(bed, file = paste0(colnames(fimo.scores.atac)[i],'_instances.bed'),
row.names=F,col.names=F,quote=F,sep='\t')
}
Plot all family peaks
plot.df = data.frame()
for(i in 1:ncol(fimo.scores.atac)) {
temp = plot.df.atac[plot.df.atac$genes %in% rownames(fimo.scores.atac[!is.na(fimo.scores.atac[,i]),]),]
temp$family = colnames(fimo.scores.atac)[i]
plot.df = rbind(plot.df,temp)
}
plot.df$genes = as.factor(plot.df$genes)
cat.colours = plot.df[plot.df$merge == 'one_groupt0', c(1,10)]
cat.colours$genes <- as.factor(cat.colours$genes)
cat.colours$supercluster <- as.factor(cat.colours$supercluster)
cat.colours$colour[cat.colours$supercluster == 'up.flat'] <- '#FF000008'
cat.colours$colour[cat.colours$supercluster == 'up.down'] <- '#FF000008'
cat.colours$colour[cat.colours$supercluster == 'gradual.up'] <- '#FF000008'
cat.colours$colour[cat.colours$supercluster == 'gradual.down'] <- '#0000FF08'
cat.colours$colour[cat.colours$supercluster == 'down.up'] <- '#0000FF08'
cat.colours$colour <- as.factor(cat.colours$colour)
cat.colours <- cat.colours[match(levels(plot.df$genes), cat.colours$genes), ]
pdf(file = 'all_families_dynamic_accessibility.pdf',width=14,height=14)
trellis.par.set(box.umbrella = list(lty = 1, col="black", lwd=1),
box.rectangle = list( lwd=1.0, col="black", alpha = 1.0),
plot.symbol = list(col="black", lwd=1.0, pch ='.'))
print(
xyplot(value ~ sample.conditions | family, group = genes, data = plot.df,
type = c('l'),#type = c('l','p'),
scales=list(x=list(cex=1.0,relation = "free", rot = 45), y =list(cex=1.0, relation="free")),
aspect=1.0,
layout = c(3,2),
between=list(y=0.5, x=0.5),
index.cond=list(c(4:6,
1:3)),
main = list(label = 'All Family Motifs in Dynamic Peaks', cex = 1.5),
ylab = list(label = 'Normalized ATAC signal', cex =1.0),
xlab = list(label = 'Time (minutes)', cex =1.0),
par.settings = list(superpose.symbol = list(pch = c(16),col=c('grey20'), cex =0.5),
strip.background=list(col="grey80"),
superpose.line = list(col = as.character(cat.colours$colour),
lwd=c(1),lty = c(1))),
panel = function(x, y, ...) {
panel.xyplot(x, y, ...)
#panel.bwplot(x, y, pch = '|', horizontal = FALSE, box.width = 15, do.out = FALSE)
#panel.spline(x, y, col ='blue', lwd =2.0, ...)
})
)
dev.off()
Plot all families bar plot, including no family category
plot.df = data.frame()
for(i in 1:ncol(fimo.scores.atac)) {
temp = plot.df.atac[plot.df.atac$genes %in% rownames(fimo.scores.atac[!is.na(fimo.scores.atac[,i]),]),]
temp$family = colnames(fimo.scores.atac)[i]
plot.df = rbind(plot.df,temp)
}
plot.df$genes = as.factor(plot.df$genes)
no.fam = rownames(fimo.scores.atac[which(rowSums(is.na(fimo.scores.atac)) == ncol(fimo.scores.atac)),])
temp = plot.df.atac[plot.df.atac$genes %in% no.fam,]
temp$family = 'No Family'
plot.df = rbind(plot.df,temp)
plot.df$status = 'Activated'
plot.df[plot.df$supercluster == 'gradual.down' | plot.df$supercluster == 'down.up',]$status = 'Repressed'
activated = table(plot.df[plot.df$status == 'Activated',]$family) / 7
repressed = table(plot.df[plot.df$status == 'Repressed',]$family) / 7
df = data.frame(names = c(names(activated),names(repressed)),
num = c(as.vector(activated),as.vector(repressed)),
cond = c(rep('Activated',length(activated)),rep('Repressed',length(repressed))),
index = rep(as.vector(table(plot.df$family)),2))
df[df$names == 'No Family',]$index = min(df$index) - 1
pdf(file='all.peaks.with.no.fam.bar.pdf',width=12,height=7)
print(
ggplot(df,aes(x = reorder(names,-index),y = num,fill=cond)) +
geom_bar(stat='identity',position='stack',color='black') +
#geom_text(aes(label=num),position=position_stack(vjust = 0.5),size=6) +
labs(title = 'Number of Peaks For Each Motif Family', y = 'Number of Peaks', x = NULL, fill = NULL) +
theme_minimal() +
theme(panel.grid.minor = element_blank(),
axis.ticks = element_blank(),
axis.text.x = element_text(angle=45,size=20,hjust=.99,vjust=1,color='black',face='bold'),
axis.text.y = element_text(size=20,face='bold',color='black'),
axis.title.y = element_text(size=18,face='bold'),
legend.text = element_text(size=18,face='bold'),
plot.title = element_text(size=22,face='bold',hjust=0.5)) +
scale_fill_manual(values = c("indianred1","dodgerblue"))
)
dev.off()
Plot traces of isolated peaks
plot.df = data.frame()
for(i in 1:ncol(fimo.scores.atac)) {
scores.temp = fimo.scores.atac[!is.na(fimo.scores.atac[,i]),]
scores.temp = scores.temp[,-i]
scores.temp = scores.temp[which(rowSums(is.na(scores.temp)) == ncol(scores.temp)),]
temp = plot.df.atac[plot.df.atac$genes %in% rownames(scores.temp),]
temp$family = colnames(fimo.scores.atac)[i]
plot.df = rbind(plot.df,temp)
}
plot.df$genes = as.factor(plot.df$genes)
cat.colours = plot.df[plot.df$merge == 'one_groupt0', c(1,10)]
cat.colours$genes <- as.factor(cat.colours$genes)
cat.colours$supercluster <- as.factor(cat.colours$supercluster)
cat.colours$colour[cat.colours$supercluster == 'up.flat'] <- '#FF00001A'
cat.colours$colour[cat.colours$supercluster == 'up.down'] <- '#FF00001A'
cat.colours$colour[cat.colours$supercluster == 'gradual.up'] <- '#FF00001A'
cat.colours$colour[cat.colours$supercluster == 'gradual.down'] <- '#0000FF1A'
cat.colours$colour[cat.colours$supercluster == 'down.up'] <- '#0000FF1A'
cat.colours$colour <- as.factor(cat.colours$colour)
cat.colours <- cat.colours[match(levels(plot.df$genes), cat.colours$genes), ]
pdf(file = 'all_isolated_families_dynamic_accessibility.pdf',width=14,height=14)
trellis.par.set(box.umbrella = list(lty = 1, col="black", lwd=1),
box.rectangle = list( lwd=1.0, col="black", alpha = 1.0),
plot.symbol = list(col="black", lwd=1.0, pch ='.'))
print(
xyplot(value ~ sample.conditions | family, group = genes, data = plot.df, type = c('l'),#type = c('l','p'),
scales=list(x=list(cex=1.5,relation = "free", rot = 45), y =list(cex=1.5, relation="free")),
aspect=1.0,
layout = c(3,2),
between=list(y=0.5, x=0.5),
index.cond=list(c(4:6,
1:3)),
main = list(label = 'All Isolated Family Motifs in Dynamic Peaks', cex = 2.0),
ylab = list(label = 'Normalized ATAC signal', cex =2.0),
xlab = list(label = 'Time (minutes)', cex =2.0),
par.settings = list(superpose.symbol = list(pch = c(16),col=c('grey20'), cex = 2.0),
strip.background=list(col="grey80"),
superpose.line = list(col = as.character(cat.colours$colour), lwd=c(1),lty = c(1))),
panel = function(x, y, ...) {
panel.xyplot(x, y, ...)
#panel.bwplot(x, y, pch = '|', horizontal = FALSE, box.width = 15, do.out = FALSE)
#panel.spline(x, y, col ='blue', lwd =2.0, ...)
})
)
dev.off()
Plot bar plot of isolated peaks
plot.df$status = 'Activated'
plot.df[plot.df$supercluster == 'gradual.down' | plot.df$supercluster == 'down.up',]$status = 'Repressed'
activated = table(plot.df[plot.df$status == 'Activated',]$family) / 7
repressed = table(plot.df[plot.df$status == 'Repressed',]$family) / 7
df = data.frame(names = c(names(activated),names(repressed)),
num = c(as.vector(activated),as.vector(repressed)),
cond = c(rep('Activated',length(activated)),rep('Repressed',length(repressed))),
index = rep(as.vector(table(plot.df$family)),2))
pdf(file='isolated.peaks.bar.pdf',width=12,height=7)
print(
ggplot(df,aes(x = reorder(names,-index),y = num,fill=cond)) +
geom_bar(stat='identity',position='stack',color='black') +
#geom_text(aes(label=num),position=position_stack(vjust = 0.5),size=6) +
labs(title = 'Number of Isolated Peaks For Each Motif Family',
y = 'Number of Peaks', x = NULL, fill = NULL) +
theme_minimal() +
theme(panel.grid.minor = element_blank(),
axis.ticks = element_blank(),
axis.text.x = element_text(angle=45,size=12,hjust=.99,vjust=1,color='black',face='bold'),
axis.text.y = element_text(size=12,face='bold',color='black'),
axis.title.y = element_text(size=14,face='bold'),
legend.text = element_text(size=12,face='bold'),
plot.title = element_text(size=16,face='bold',hjust=0.5)) +
scale_fill_manual(values = c("indianred1","dodgerblue"))
)
dev.off()
Plot family heatmaps
prop.fam.df = data.frame(matrix(nrow=ncol(fimo.scores.atac),ncol=ncol(fimo.scores.atac)),
row.names=colnames(fimo.scores.atac))
total.fam.df = data.frame(matrix(nrow=ncol(fimo.scores.atac),ncol=ncol(fimo.scores.atac)),
row.names=colnames(fimo.scores.atac))
colnames(prop.fam.df) = colnames(fimo.scores.atac)
colnames(total.fam.df) = colnames=colnames(fimo.scores.atac)
for(i in 1:ncol(fimo.scores.atac)) {
x = fimo.scores.atac[!is.na(fimo.scores.atac[,i]),]
for(j in 1:ncol(x)) {
z = x[!is.na(x[,j]),]
total.fam.df[j,i] = nrow(z)
prop.fam.df[j,i] = 100*(nrow(z)/nrow(fimo.scores.atac[!is.na(fimo.scores.atac[,j]),]))
}
}
rownames(prop.fam.df) = colnames(fimo.scores.atac)
colnames(prop.fam.df) = colnames(fimo.scores.atac)
prop.fam.df = prop.fam.df[order(colnames(prop.fam.df)),]
pdf(file='prop.fam.heatmap.pdf')
pheatmap(prop.fam.df,cluster_rows=FALSE, show_rownames=TRUE,cluster_cols=FALSE,fontsize=12)
dev.off()
pdf(file='total.fam.heatmap.pdf')
pheatmap(total.fam.df,cluster_rows=FALSE, show_rownames=TRUE,cluster_cols=FALSE,fontsize=12)
dev.off()
Plot up/down split occurrences
plot.df = plot.df.atac
plot.df$status = 'Activated'
plot.df[plot.df$supercluster == 'gradual.down' | plot.df$supercluster == 'down.up',]$status = 'Repressed'
for(i in 1:ncol(fimo.scores.atac)) {
up.num = c()
down.num = c()
for(j in 1:ncol(fimo.scores.atac)) {
x = fimo.scores.atac[!is.na(fimo.scores.atac[,j]),]
y = plot.df[plot.df$genes %in% rownames(x),]
up = fimo.scores.atac[rownames(fimo.scores.atac) %in% y[y$status == 'Activated',]$genes,]
down = fimo.scores.atac[rownames(fimo.scores.atac) %in% y[y$status == 'Repressed',]$genes,]
up.num = append(up.num, (nrow(up[!is.na(up[,i]),])/nrow(up))*100)
down.num = append(down.num, (nrow(down[!is.na(down[,i]),])/nrow(down))*100)
}
df = data.frame(names = rep(colnames(fimo.scores.atac),2),
#names = rep(c('NRF','TWIST','STAT','TFAP2','ZNF263','AP1','bHLH','GR',
'CTCFL','SP/KLF','ELK/ETV','GFX/ZBTB33','Maz','NFY'),2),
num = c(up.num,down.num),
cond = c(rep('Activated',length(up.num)),rep('Repressed',length(down.num))))
#index = rep(c(10:14,1:9),2))
name = df[i,]$names
df = df[-which(df$names == name),]
df$names = as.factor(df$names)
pdf(file=paste0('percent.',name,'.in.other.families.pdf'),width=8,height=5)
print(
ggplot(df,aes(x = names,y = num,fill=cond)) +
geom_bar(stat='identity',position='dodge',color='black') +
labs(title = paste0('% of ',name,' Occurrences in Other Family Motifs'),
y = paste0('% of Occurrences with ',name),
x = 'Motif Family',
fill = 'Direction') +
theme_minimal() +
scale_x_discrete(labels= levels(df$names)) +
theme(panel.grid.minor = element_blank(),
plot.title = element_text(hjust = 0.5,face='bold'),
axis.text.x = element_text(angle=45,size=12,hjust=.99,vjust=1,color='black',face='bold'),
axis.title.x = element_text(size=14,face='bold'),
axis.text.y = element_text(size=12,face='bold',color='black'),
axis.title.y = element_text(size=14,face='bold'),
legend.title = element_text(size=14,face='bold',color='black'),
legend.text = element_text(size=12,face='bold',color='black'),
axis.ticks = element_blank()) +
scale_fill_manual(values = c("indianred1","dodgerblue"))
)
dev.off()
}
Plot FIMO scores box and whisker plots for isolated peaks
plot.df = data.frame()
for(i in 1:ncol(fimo.scores.atac)) {
scores.temp = fimo.scores.atac[!is.na(fimo.scores.atac[,i]),]
scores.temp = scores.temp[,-i]
scores.temp = scores.temp[which(rowSums(is.na(scores.temp)) == ncol(scores.temp)),]
temp = plot.df.atac[plot.df.atac$genes %in% rownames(scores.temp),]
temp$family = colnames(fimo.scores.atac)[i]
plot.df = rbind(plot.df,temp)
}
plot.df$status = 'Activated'
plot.df[plot.df$supercluster == 'gradual.down' | plot.df$supercluster == 'down.up',]$status = 'Repressed'
plot.df = unique(plot.df[,c(1,11,12)])
func <- function(peak) {
family = plot.df[plot.df$genes == peak,]$family
colnum = which(colnames(fimo.scores.atac) == family)
score = fimo.scores.atac[rownames(fimo.scores.atac) == peak,colnum]
return(score)
}
plot.df$score = sapply(plot.df$genes,func)
pdf(file = paste0('fimo.scores.isolated.peaks.bw.pdf'),width=12,height=8)
trellis.par.set(box.umbrella = list(lty = 1, col=c("red", "blue"), lwd=2),
box.rectangle = list( lwd=2.0, col=c("red", "blue"), alpha = 1.0),
plot.symbol = list(col=c("red", "blue"), lwd=2.0, pch ='.'))
print(
bwplot(log(as.numeric(as.character(score)), base = 10) ~ status | family, data = plot.df,
horizontal = FALSE, pch = '|', do.out = FALSE,
scales=list(x=list(cex=1.0, relation = "free", rot = 45), y = list(cex=1.0, relation="free")),
aspect=2.0,
between=list(y=0.5, x=0.5),
index.cond=list(c(4:6,1:3)),
ylab = expression('log'[10]* paste(Sigma,'(motif scores)')),
xlab = expression('ATAC Peak category'),
#manually settign to avoid outlier, since do.out = FALSE
#ylim = list(c(0.9, 1.9), c(0.9, 1.7), c(0.8, 1.8), c(1, 2.15), c(0.96, 1.4)),
par.settings = list(superpose.symbol = list(pch = c(16),col=c('grey20'), cex =0.5),
#I want to change the background strip to the corresponding motif color
strip.background=list(col=c("grey80"))))
)
dev.off()
Plot FIMO scores box and whisker plots for all peaks
plot.df = data.frame()
for(i in 1:ncol(fimo.scores.atac)) {
temp = plot.df.atac[plot.df.atac$genes %in% rownames(fimo.scores.atac[!is.na(fimo.scores.atac[,i]),]),]
temp$family = colnames(fimo.scores.atac)[i]
plot.df = rbind(plot.df,temp)
}
plot.df$status = 'Activated'
plot.df[plot.df$supercluster == 'gradual.down' | plot.df$supercluster == 'down.up',]$status = 'Repressed'
plot.df = unique(plot.df[,c(1,11,12)])
func <- function(peak,family) {
#family = plot.df[plot.df$genes == peak,]$family
colnum = which(colnames(fimo.scores.atac) == family)
score = fimo.scores.atac[rownames(fimo.scores.atac) == peak,colnum]
return(score)
}
plot.df$score = mapply(func,plot.df$genes,family=plot.df$family)
pdf(file = paste0('fimo.scores.all.peaks.bw.pdf'),width=12,height=8)
trellis.par.set(box.umbrella = list(lty = 1, col=c("red", "blue"), lwd=2),
box.rectangle = list( lwd=2.0, col=c("red", "blue"), alpha = 1.0),
plot.symbol = list(col=c("red", "blue"), lwd=2.0, pch ='.'))
print(
bwplot(log(as.numeric(as.character(score)), base = 10) ~ status | family, data = plot.df,
horizontal = FALSE, pch = '|', do.out = FALSE,
scales=list(x=list(cex=1.0, relation = "free", rot = 45), y = list(cex=1.0, relation="free")),
aspect=2.0,
between=list(y=0.5, x=0.5),
index.cond=list(c(4:6,1:3)),
ylab = expression('log'[10]* paste(Sigma,'(motif scores)')),
xlab = expression('ATAC Peak category'),
#manually settign to avoid outlier, since do.out = FALSE
#ylim = list(c(0.9, 1.9), c(0.9, 1.7), c(0.8, 1.8), c(1, 2.15), c(0.96, 1.4)),
par.settings = list(superpose.symbol = list(pch = c(16),col=c('grey20'), cex =0.5),
#I want to change the background strip to the corresponding motif color
strip.background=list(col=c("grey80"))))
)
dev.off()
Plot split on FIMO scores for isolated peaks
plot.df = data.frame()
for(i in 1:ncol(fimo.scores.atac)) {
scores.temp = fimo.scores.atac[!is.na(fimo.scores.atac[,i]),]
scores.temp = scores.temp[,-i]
scores.temp = scores.temp[which(rowSums(is.na(scores.temp)) == ncol(scores.temp)),]
temp = plot.df.atac[plot.df.atac$genes %in% rownames(scores.temp),]
temp$family = colnames(fimo.scores.atac)[i]
plot.df = rbind(plot.df,temp)
}
plot.df$status = 'Activated'
plot.df[plot.df$supercluster == 'gradual.down' | plot.df$supercluster == 'down.up',]$status = 'Repressed'
func <- function(peak) {
family = unique(plot.df[plot.df$genes == peak,]$family)
colnum = which(colnames(fimo.scores.atac) == family)
score = fimo.scores.atac[rownames(fimo.scores.atac) == peak,colnum]
return(score)
}
plot.df$score = sapply(plot.df$genes,func)
for(fam in unique(plot.df$family)) {
df = plot.df[plot.df$family == fam,]
fam = gsub('/','.',fam)
print(fam)
df$quantile = paste0('Quantile ',split_quantile(x = as.numeric(df$score),type = 5))
df$genes = as.factor(df$genes)
cat.colours = df
cat.colours$genes <- as.factor(cat.colours$genes)
cat.colours$status <- as.factor(cat.colours$status)
cat.colours$colour[cat.colours$status == 'Activated'] <- '#FF000008'
cat.colours$colour[cat.colours$status == 'Repressed'] <- '#0000FF08'
cat.colours$colour <- as.factor(cat.colours$colour)
cat.colours <- cat.colours[match(levels(df$genes), cat.colours$genes),]
pdf(file = paste0(fam,'_isolated_split_on_FIMO_scores.pdf'),width=11,height=4)
trellis.par.set(box.umbrella = list(lty = 1, col="black", lwd=1),
box.rectangle = list( lwd=1.0, col="black", alpha = 1.0),
plot.symbol = list(col="black", lwd=1.0, pch ='.'))
print(
xyplot(value ~ sample.conditions | quantile, group = genes, data = df,
type = c('l'),#type = c('l','p'),
scales=list(x=list(cex=1.0,relation = "free", rot = 45), y = list(cex=1.0, relation="free")),
aspect=1.0,
layout = c(5,1),
between=list(y=0.5, x=0.5),
main = list(label = paste0(fam, ' Isolated Motifs Stratified By FIMO Score'), cex = 1.5),
ylab = list(label = 'Normalized ATAC signal', cex =1.0),
xlab = list(label = 'Time (minutes)', cex =1.0),
par.settings = list(superpose.symbol = list(pch = c(16),col=c('grey20'), cex =0.5),
strip.background=list(col="grey80"),
superpose.line = list(col = as.character(cat.colours$colour),
lwd=c(1),lty = c(1))),
panel = function(x, y, ...) {
panel.xyplot(x, y, ...)
panel.bwplot(x, y, pch = '|', horizontal = FALSE, box.width = 15, do.out = FALSE)
panel.spline(x, y, col = 'grey70', lwd =3.0, ...)
})
)
dev.off()
}
Plot split on FIMO scores for all peaks
plot.df = data.frame()
for(i in 1:ncol(fimo.scores.atac)) {
temp = plot.df.atac[plot.df.atac$genes %in% rownames(fimo.scores.atac[!is.na(fimo.scores.atac[,i]),]),]
temp$family = colnames(fimo.scores.atac)[i]
plot.df = rbind(plot.df,temp)
}
plot.df$status = 'Activated'
plot.df[plot.df$supercluster == 'gradual.down' | plot.df$supercluster == 'down.up',]$status = 'Repressed'
func <- function(peak,family) {
#family = plot.df[plot.df$genes == peak,]$family
colnum = which(colnames(fimo.scores.atac) == family)
score = fimo.scores.atac[rownames(fimo.scores.atac) == peak,colnum]
return(score)
}
plot.df$score = mapply(func,plot.df$genes,family=plot.df$family)
for(fam in unique(plot.df$family)) {
df = plot.df[plot.df$family == fam,]
print(fam)
df$quantile = paste0('Quantile ',split_quantile(x = as.numeric(df$score),type = 5))
df$genes = as.factor(df$genes)
cat.colours = df
cat.colours$genes <- as.factor(cat.colours$genes)
cat.colours$status <- as.factor(cat.colours$status)
cat.colours$colour[cat.colours$status == 'Activated'] <- '#FF000008'
cat.colours$colour[cat.colours$status == 'Repressed'] <- '#0000FF08'
cat.colours$colour <- as.factor(cat.colours$colour)
cat.colours <- cat.colours[match(levels(df$genes), cat.colours$genes),]
pdf(file = paste0(fam,'_all_split_on_FIMO_scores.pdf'),width=11,height=4)
trellis.par.set(box.umbrella = list(lty = 1, col="black", lwd=1),
box.rectangle = list( lwd=1.0, col="black", alpha = 1.0),
plot.symbol = list(col="black", lwd=1.0, pch ='.'))
print(
xyplot(value ~ sample.conditions | quantile, group = genes, data = df,
type = c('l'),#type = c('l','p'),
scales=list(x=list(cex=1.0,relation = "free", rot = 45), y = list(cex=1.0, relation="free")),
aspect=1.0,
layout = c(5,1),
between=list(y=0.5, x=0.5),
main = list(label = paste0(fam, ' All Motifs Stratified By FIMO Score'), cex = 1.5),
ylab = list(label = 'Normalized ATAC signal', cex =1.0),
xlab = list(label = 'Time (minutes)', cex =1.0),
par.settings = list(superpose.symbol = list(pch = c(16),col=c('grey20'), cex =0.5),
strip.background=list(col="grey80"),
superpose.line = list(col = as.character(cat.colours$colour),
lwd=c(1),lty = c(1))),
panel = function(x, y, ...) {
panel.xyplot(x, y, ...)
panel.bwplot(x, y, pch = '|', horizontal = FALSE, box.width = 15, do.out = FALSE)
panel.spline(x, y, col = 'grey70', lwd =3.0, ...)
})
)
dev.off()
}
Generate composites for all activated and repressed peaks
pswm.fimo <- function(fimo.in, out = 'outfilename', nm = 'bHLH_Activated', rc = FALSE) {
posnum = nchar(fimo.in$sequence[1])
fimo.in$sequence = toupper(fimo.in$sequence)
col.matrix = matrix()
for (g in 1:posnum){
itnum = lapply(strsplit(as.character(fimo.in$sequence), ''), "[", g)
if (g == 1) {
col.matrix = itnum
} else {
col.matrix = cbind(col.matrix, itnum)
}
}
a.nuc = sapply(1:posnum, function(x) sum(col.matrix[,x] == "A"))
t.nuc = sapply(1:posnum, function(x) sum(col.matrix[,x] == "T"))
c.nuc = sapply(1:posnum, function(x) sum(col.matrix[,x] == "C"))
g.nuc = sapply(1:posnum, function(x) sum(col.matrix[,x] == "G"))
pswm = cbind(a.nuc, c.nuc, g.nuc, t.nuc)
print(pswm)
outfile = file(paste0(out, '.txt'))
on.exit(close(outfile))
writeLines(c("MEME version 4", "ALPHABET= ACGT", "strands: + -", " ",
"Background letter frequencies (from uniform background):",
"A 0.30000 C 0.20000 G 0.20000 T 0.30000", paste("MOTIF", out), " ",
paste("letter-probability matrix: alength= 4 w=", posnum)), outfile)
pswm = pswm/rowSums(pswm)
if (rc == "TRUE"){
pswm<- pswm[nrow(pswm):1,ncol(pswm):1]
} else {}
write.table(pswm, file = paste0(out, '.txt'), append = TRUE,
quote=FALSE, row.names =FALSE, col.names = FALSE)
pswm = t(pswm)
rownames(pswm) = c('A', 'C', 'G', 'T')
return(pswm)
#
# system(paste0('/Users/guertinlab/meme/libexec/meme-5.1.1/ceqlogo -i ', out,
'.txt -m Composite -o ', nm, '.eps'))
# system(paste0('/Users/guertinlab/meme/libexec/meme-5.1.1/ceqlogo -i ', out,
'.txt -m Composite -o ', nm, '.rc.eps -r'))
}
plot.df = plot.df.atac
plot.df$status = 'Activated'
plot.df[plot.df$supercluster == 'gradual.down' | plot.df$supercluster == 'down.up',]$status = 'Repressed'
count = -1
vec.names = c()
df.seq = as.data.frame(matrix(ncol=5, nrow=0),stringsAsFactors = FALSE)
colnames(df.seq) = c('chr', 'start', 'end', 'sequence', 'factor')
for(bed.file in Sys.glob(file.path(paste0(dir,'main_figure_beds/*fimo.bed')))) {
#print(bed.file)
count = count + 1
factor.name = strsplit(bed.file, "/")[[1]]
factor.name = strsplit(factor.name[length(factor.name)],
'_fimo.bed')[[1]][1]
print(factor.name)
x = read.table(bed.file, stringsAsFactors=FALSE)
x = x[x[,6] != -1,]
x = x[,c(1,2,3,10)]
x$factor = factor.name
colnames(x) = c('chr', 'start', 'end', 'sequence', 'factor')
x$re = paste0(x[,1], ':', x[,2], '-', x[,3])
df.seq = rbind(df.seq, x)
}
#now add SP and KLF
sp = read.table('/scratch/bhn9by/ATAC/SP_KLF_split/output_sp1.txt', stringsAsFactors=FALSE,
sep = '\t', header = TRUE)[,c(3,10)]
out.sp <- strsplit(as.character(sp[,1]), ':')
sp <- data.frame(do.call(rbind, out.sp, quote=FALSE), sp[,c(2)])
sp <- sp[,c(1:3,10)]
colnames(sp) <- c('chr','start','end','sequence')
sp$factor = 'SP'
sp$re = paste0(sp$chr,':',sp$start,'-',sp$end)
df.seq = rbind(df.seq,sp)
klf = read.table('/scratch/bhn9by/ATAC/SP_KLF_split/output_klf.txt', stringsAsFactors=FALSE,
sep = '\t', header = TRUE)[,c(3,10)]
out.klf <- strsplit(as.character(klf[,1]), ':')
klf <- data.frame(do.call(rbind, out.klf, quote=FALSE), klf[,c(2)])
klf <- klf[,c(1:3,10)]
colnames(klf) <- c('chr','start','end','sequence')
klf$factor = 'KLF'
klf$re = paste0(klf$chr,':',klf$start,'-',klf$end)
df.seq = rbind(df.seq,klf)
func <- function(peak) {
return(unique(plot.df[plot.df$genes == peak,]$status))
}
df.seq$status = sapply(df.seq$re,func)
for(i in unique(df.seq$factor)) {
act = df.seq[df.seq$factor == i & df.seq$status == 'Activated',]
rep = df.seq[df.seq$factor == i & df.seq$status == 'Repressed',]
pswm.fimo(act, out = paste0(i,'_activated'))
pswm.fimo(rep, out = paste0(i,'_repressed'))
}
Generating bigWig for motif enrichment plot
github raw
#prepare bigWigs for motif enrichment plot
cd /scratch/bhn9by/ATAC/fimo_composites/main_figure_beds
cat SP_unsorted.bed | sort -k1,1 -k2,2n > SP_2M.bed
cat KLF_unsorted.bed | sort -k1,1 -k2,2n > KLF_2M.bed
module load ucsc-tools/3.7.4 gcc/9.2.0 bedtools/2.29.2
intersectBed -loj -a ../../all_peaks.bed -b SP_2M.bed > ../SP_fimo_all.bed
intersectBed -loj -a ../../all_peaks.bed -b KLF_2M.bed > ../KLF_fimo_all.bed
intersectBed -loj -a ../../dynamic_peaks.bed -b SP_2M.bed > SP_fimo.bed
intersectBed -loj -a ../../dynamic_peaks.bed -b KLF_2M.bed > KLF_fimo.bed
for bed in *2M.bed
do
name=$(echo $bed | awk -F"/" '{print $NF}' | awk -F"_2M.bed" '{print $1}')
echo $name
#summing scores of motifs w/in overlapping genomic interval
cat $bed | mergeBed -i stdin -c 4 -o sum > ${name}_merged_2M.bed
bedGraphToBigWig ${name}_merged_2M.bed ../../mm10.chrom.sizes ${name}_mm10_instances.bigWig
rm ${name}_merged_2M.bed
done
#CAUTION: If you want to rerun the preceding post.composite.fimo.R,
#rm SP_fimo.bed KLF_fimo.bed
#else, they will error
Plot motif enrichment around summits
github raw
library(lattice)
library(bigWig)
dir = '/scratch/bhn9by/ATAC/fimo_composites/'
setwd(dir)
plot.fimo.lattice <- function(dat, fact = 'Motif', summit = 'Hypersensitivity Summit', class= '',
num.m = -200, num.p =90, y.low =0, y.high = 0.2,
col.lines = c(rgb(0,0,1,1/2), rgb(1,0,0,1/2),
rgb(0.1,0.5,0.05,1/2), rgb(0,0,0,1/2),
rgb(1/2,0,1/2,1/2), rgb(0,1/2,1/2,1/2), rgb(1/2,1/2,0,1/2)),
fill.poly = c(rgb(0,0,1,1/4), rgb(1,0,0,1/4), rgb(0.1,0.5,0.05,1/4),
rgb(0,0,0,1/4), rgb(1/2,0,1/2,1/4))) {
pdf('motif_enrichment_around_summits.pdf')#, width=6.83, height=3.5)
print(xyplot(density ~ range|tf, groups = category, data = dat, type = 'l',
scales=list(x=list(cex=0.8,relation = "free"),
y =list(cex=0.8,axs = 'i',relation = "free")),
xlim=c(num.m,num.p),
col = col.lines,
auto.key = list(points=F, lines=T, cex=0.8, columns = 2),
par.settings = list(superpose.symbol = list(pch = c(16), col=col.lines, cex =0.7),
superpose.line = list(col = col.lines, lwd=c(2,2,2,2,2,2),
lty = c(1,1,1,1,1,1,1,1,1))),
cex.axis=1.0,
par.strip.text=list(cex=0.9, font=1, col='black',font=2),
aspect=1.0,
between=list(y=0.5, x=0.5),
index.cond = list(c(4:6,1:3)),
lwd=2,
ylab = list(label = "Weighted Motif Density", cex =1,font=2),
xlab = list(label = 'Distance from ATAC-seq Peak Summit', cex =1,font=2),
strip = function(..., which.panel, bg) {
bg.col = 'grey'#c("blue","grey65","red")
strip.default(..., which.panel = which.panel,
bg = rep(bg.col, length = which.panel)[which.panel])
}
))
dev.off()
}
load('/scratch/bhn9by/ATAC/plot.df.atac.Rdata')
load('/scratch/bhn9by/ATAC/fimo.scores.atac.Rdata')
plot.df = data.frame()
for(i in 1:ncol(fimo.scores.atac)) {
temp = plot.df.atac[plot.df.atac$genes %in% rownames(fimo.scores.atac[!is.na(fimo.scores.atac[,i]),]),]
temp$family = colnames(fimo.scores.atac)[i]
plot.df = rbind(plot.df,temp)
}
plot.df$status = 'Activated'
plot.df[plot.df$supercluster == 'gradual.down' | plot.df$supercluster == 'down.up',]$status = 'Repressed'
all.fimo = data.frame(matrix(ncol = 4, nrow = 0))
colnames(all.fimo) = c('density', 'tf', 'category', 'range')
half.win = 600
file.suffix = '_mm10_instances.bigWig'
dir = '/scratch/bhn9by/ATAC/fimo_composites/main_figure_beds/'
decreased = plot.df[plot.df$status == 'Repressed',7:9]
decreased[,2] = as.numeric(decreased[,2])
decreased[,3] = as.numeric(decreased[,3])
decreased = bed.window(decreased,half.win)
increased = plot.df[plot.df$status == 'Activated',7:9]
increased[,2] = as.numeric(increased[,2])
increased[,3] = as.numeric(increased[,3])
increased = bed.window(increased,half.win)
not.different = read.table('/scratch/bhn9by/ATAC/nondynamic_peaks.bed')
not.different = not.different[not.different$V1 != 'chrM',]
not.different = bed.window(not.different,half.win)
all.fimo = data.frame()
for(i in 1:ncol(fimo.scores.atac)) {
factor = colnames(fimo.scores.atac)[i]
mod.bigWig = paste0(dir,factor,file.suffix)
factor.name = factor
print(factor.name)
loaded.bw = load.bigWig(mod.bigWig)
dec.inten = bed.step.probeQuery.bigWig(loaded.bw, decreased,
gap.value = 0, step = 10, as.matrix = TRUE)
dec.query.df = data.frame(cbind(colMeans(dec.inten), factor.name,
'Closed', seq(-half.win, (half.win-10), 10)), stringsAsFactors=F)
colnames(dec.query.df) = c('density', 'tf', 'category', 'range')
inc.inten = bed.step.probeQuery.bigWig(loaded.bw, increased,
gap.value = 0, step = 10, as.matrix = TRUE)
inc.query.df = data.frame(cbind(colMeans(inc.inten), factor.name,
'Opened', seq(-half.win,(half.win-10), 10)), stringsAsFactors=F)
colnames(inc.query.df) = c('density', 'tf', 'category', 'range')
ctrl.inten = bed.step.probeQuery.bigWig(loaded.bw, not.different,
gap.value = 0, step = 10, as.matrix = TRUE)
ctrl.query.df = data.frame(cbind(colMeans(ctrl.inten), factor.name,
'Nondynamic', seq(-half.win, (half.win-10), 10)), stringsAsFactors=F)
colnames(ctrl.query.df) = c('density', 'tf', 'category', 'range')
tf.all = rbind(dec.query.df, inc.query.df, ctrl.query.df)
all.fimo = rbind(all.fimo,tf.all)
}
all.fimo[,1] = as.numeric(all.fimo[,1])
all.fimo[,4] = as.numeric(all.fimo[,4])
plot.fimo.lattice(all.fimo, num.m = -500, num.p = 500,
col.lines = c('blue','grey65','red'))
Primary Transcript Annotation
Download required scripts
pTA.functions.R: this .R file provides functions for performing pTA
overlaps_remove.csv, overlaps_keepadjacent.csv, duplicate_remove.csv, duplicate_keepadjacent.csv: these tables provide names of genes to be kept or removed from pTA
github raw
#!/bin/bash
cd /scratch/bhn9by/primary_transcript_annotation
wget https://raw.githubusercontent.com/guertinlab/adipogenesis/master/Bao_code_chunks_PRO/misc_scripts/pTA.functions.R
wget https://raw.githubusercontent.com/guertinlab/adipogenesis/master/Bao_code_chunks_PRO/misc_scripts/overlaps_remove.csv
wget https://raw.githubusercontent.com/guertinlab/adipogenesis/master/Bao_code_chunks_PRO/misc_scripts/overlaps_keepadjacent.csv
wget https://raw.githubusercontent.com/guertinlab/adipogenesis/master/Bao_code_chunks_PRO/misc_scripts/duplicate_remove.csv
wget https://raw.githubusercontent.com/guertinlab/adipogenesis/master/Bao_code_chunks_PRO/misc_scripts/duplicate_keepadjacent.csv
#remove DOS \r\n\ artifact from .csv
sed -i 's/\r$//' *csv
Retrieve reference chromosomes files and merge bigWig files
github raw
module load bioconda gcc/7.1.0 openmpi/3.1.4 R/4.0.0
conda activate myenv2
cd /scratch/bhn9by/PRO
# should not run wget in slurm--download connection is often severed
wget https://hgdownload-test.gi.ucsc.edu/goldenPath/mm10/bigZips/mm10.chrom.sizes
wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M25/gencode.vM25.annotation.gtf.gz
gunzip gencode.vM25.annotation.gtf.gz
# get the first exons for protein coding genes
grep 'transcript_type "protein_coding"' gencode.vM25.annotation.gtf | \
awk '{if($3=="exon"){print $0}} ' | \
grep -w "exon_number 1" | \
cut -f1,4,5,7,9 | tr ";" "\t" | \
awk '{for(i=5;i<=NF;i++){if($i~/^gene_name/){a=$(i+1)}} print $1,$2,$3,a,"na",$4}' | \
tr " " "\t" | tr -d '"' > gencode.mm10.firstExon.bed
# get all transcripts for protein coding genes
grep 'transcript_type "protein_coding"' gencode.vM25.annotation.gtf | \
awk '{if($3=="transcript"){print $0}} ' | \
cut -f1,4,5,7,9 | tr ";" "\t" | \
awk '{for(i=5;i<=NF;i++){if($i~/^gene_name/){a=$(i+1)}} print $1,$2,$3,a,"na",$4}' | \
tr " " "\t" | tr -d '"' > gencode.mm10.transcript.bed
# chrom sizes file
chromsizes=mm10.chrom.sizes
# merge plus
plusfiles=$(ls *plus*bigWig)
bigWigMerge ${plusfiles} pro_plus_merged.bedGraph
LC_COLLATE=C sort -k1,1 -k2,2n pro_plus_merged.bedGraph > pro_plus_merged_sorted.bedGraph
bedGraphToBigWig pro_plus_merged_sorted.bedGraph ${chromsizes} pro_plus_merged.bigWig
# merge minus
minusfiles=$(ls *minus*bigWig)
bigWigMerge ${minusfiles} pro_minus_merged.bedGraph
LC_COLLATE=C sort -k1,1 -k2,2n pro_minus_merged.bedGraph > pro_minus_merged_sorted.bedGraph
bedGraphToBigWig pro_minus_merged_sorted.bedGraph ${chromsizes} pro_minus_merged.bigWig
Annotating Primary Transcript
pTA Rscript takes ~2 hour to compile. Run as a slurm job for your own convenience.
github raw
library(devtools)
library(NMF)
library(dplyr)
library(bigWig)
library(pracma)
library(RColorBrewer)
#install_github("WarrenDavidAnderson/genomicsRpackage/primaryTranscriptAnnotation")
library(primaryTranscriptAnnotation)
setwd('/scratch/bhn9by/PRO/primary_transcript_annotation')
source('pTA.functions.R')
# import data for first exons, annotate, and remove duplicate transcripts
fname = "../gencode.mm10.firstExon.bed"
dat0 = read.table(fname,header=F,stringsAsFactors=F)
names(dat0) = c('chr', 'start', 'end', 'gene', 'xy', 'strand')
dat0 = unique(dat0)
gencode.firstExon = dat0
# import data for all transcripts, annotate, and remove duplicate transcripts
fname = "../gencode.mm10.transcript.bed"
dat0 = read.table(fname,header=F,stringsAsFactors=F)
names(dat0) = c('chr', 'start', 'end', 'gene', 'xy', 'strand')
dat0 = unique(dat0)
gencode.transcript = dat0
# chromosome sizes
chrom.sizes = read.table("../mm10.chrom.sizes",stringsAsFactors=F,header=F)
names(chrom.sizes) = c("chr","size")
plus.file = "../pro_plus_merged.bigWig"
minus.file = "../pro_minus_merged.bigWig"
bw.plus = load.bigWig(plus.file)
bw.minus = load.bigWig(minus.file)
#get intervals for furthest TSS and TTS +/- interval
largest.interval.bed = get.largest.interval(bed=gencode.transcript)
# filtering which read count and read density might be consider as insignificant
transcript.reads = read.count.transcript(bed=gencode.transcript, bw.plus=bw.plus, bw.minus=bw.minus)
# evaluate count and density distributions
pdf("Histogram_density_distibution.pdf", useDingbats = FALSE, width=6.4, height=5.4)
par(mfrow=c(1,2))
hist(log(transcript.reads$density), breaks=200,
col="black",xlab="log10 read density",main="")
hist(log(transcript.reads$counts), breaks=200,
col="black",xlab="log10 read count",main="")
dev.off()
# specify which genes to cut based on low expression, visualize cutoffs
pdf("Histogram_density_distibution_treshold.pdf", useDingbats = FALSE, width=6.4, height=5.4)
den.cut = -4
cnt.cut = 2
ind.cut.den = which(log(transcript.reads$density) < den.cut)
ind.cut.cnt = which(log(transcript.reads$counts) < cnt.cut)
ind.cut = union(ind.cut.den, ind.cut.cnt)
par(mfrow=c(1,2))
hist(log(transcript.reads$density), breaks=200,
col="black",xlab="log"[10]~" read density",main="")
abline(v=den.cut, col="red")
hist(log(transcript.reads$counts), breaks=200,
col="black",xlab="log"[10]~" read count",main="")
abline(v=cnt.cut, col="red")
dev.off()
# remove "unexpressed" genes
unexp = names(transcript.reads$counts)[ind.cut]
largest.interval.expr.bed = largest.interval.bed[
!(largest.interval.bed$gene %in% unexp),]
# select the TSS for each gene and incorporate these TSSs
# into the largest interval coordinates
bp.range = c(20,120)
cnt.thresh = 2
bed.out = largest.interval.expr.bed
bed.in = gencode.firstExon[gencode.firstExon$gene %in% bed.out$gene,]
TSS.gene = get.TSS(bed.in=bed.in, bed.out=bed.out,
bw.plus=bw.plus, bw.minus=bw.minus,
bp.range=bp.range, cnt.thresh=cnt.thresh)
TSS.gene = TSS.gene$bed
# parameters and analysis
window = 1000
bp.bin = 10
bed1 = TSS.gene
bed2 = largest.interval.expr.bed
bed2 = bed2[bed2$gene %in% bed1$gene,]
tss.eval = eval.tss(bed1=bed1, bed2=bed2,
bw.plus=bw.plus, bw.minus=bw.minus,
window=window, bp.bin=bp.bin, fname="TSSres.pdf")
#Inferred TSSs showed as distribution of paused RNA polymerase density
pdf("TS.pdf", useDingbats = FALSE, width=6.4, height=5.4)
bk = seq(-window/2,window/2, bp.bin)
hist(tss.eval$tss.dists.inf$dist, main="overlay",
xlab="dist from TSS to read max (bp)",
breaks=bk, col='black')
bk = seq(-window/2,window/2, bp.bin)
hist( tss.eval$tss.dists.lng$dist,
breaks=bk, col='red', add=T)
dev.off()
# identify duplicates
dups = get.dups(bed = TSS.gene)
max(dups$cases)
head(dups)
write.table(dups,"duplicateCoords.bed",quote=F,sep="\t",col.names=F,row.names=F)
#put *merged.bigWig files on cyverse and look at duplicate examples in browser
#annotations that should be entirely removed go into 'duplicate_remove.csv'
#annotations that should be kept should be ordered and put into 'duplicate_keepadjacent.csv'
remove.genes.id <- read.csv(file="duplicate_remove.csv",
header=TRUE, sep=",", stringsAsFactors=F)
fix.genes.id <- read.csv("duplicate_keepadjacent.csv",
header= TRUE, sep=",", stringsAsFactors=F)
# remove genes based on manual analysis, genes cosidered as adjecent will be adressed below
genes.remove1 = c(remove.genes.id$remove, fix.genes.id$upstream, fix.genes.id$downstream)
TSS.gene.filtered1 = TSS.gene[!(TSS.gene$gene %in% genes.remove1),]
# confirm that there are no any overlaps
#this object should have no data
dups1 =get.dups(bed = TSS.gene.filtered1)
# overlap analysis
overlap.data = gene.overlaps(bed = TSS.gene.filtered1)
has.start.inside = overlap.data$has.start.inside
is.a.start.inside = overlap.data$is.a.start.inside
head(has.start.inside)
dim(has.start.inside) #271
head(is.a.start.inside)
dim(is.a.start.inside) #323
head(overlap.data$cases)
dim(overlap.data$cases) #503
# Checking those genes in overlap.gene lists
overlap.genes = overlap.data$cases$gene %>% unique
length(grep("^AC[0-9][0-9]",overlap.genes)) #7
length(grep("^AL[0-9][0-9]",overlap.genes)) #3
length(grep("^AP[0-9][0-9]",overlap.genes)) #0
# set to remove usually poorly annotated genes in overlapping regions
genes.remove2 = overlap.genes[grep("^AC[0-9][0-9]",overlap.genes)]
genes.remove2 = c(genes.remove2, overlap.genes[grep("^AL[0-9][0-9]",overlap.genes)])
genes.remove2 = c(genes.remove2, overlap.genes[grep("^AP[0-9][0-9]",overlap.genes)])
# identify genes with multiple starts inside (i.e. 'big' genes)
mult.inside.starts = inside.starts(vec = is.a.start.inside$xy)
length(mult.inside.starts) #31
# add genes with multiple starts inside to remove
genes.remove2 = c(genes.remove2, mult.inside.starts %>% unique) #41
# remove filtered genes and re-run overlap analysis
in.dat = TSS.gene.filtered1[!(TSS.gene.filtered1$gene %in% genes.remove2),]
overlap.data = gene.overlaps( bed = in.dat )
has.start.inside = overlap.data$has.start.inside
is.a.start.inside = overlap.data$is.a.start.inside
case.dat = overlap.data$cases
length(unique(case.dat$cases)) #171
# data for manual analysis and curration
fname = "nonid_overlaps.txt"
write.table(case.dat,fname,col.names=T,row.names=F,sep="\t",quote=F)
overlaps = case.dat %>% select(chr,start,end,gene,cases)
write.table(overlaps,"nonid_overlaps.bed",quote=F,sep="\t",col.names=F,row.names=F)
write.table(TSS.gene.filtered1,"TSS.gene.filtered1.bed",quote=F,sep="\t",col.names=F,row.names=F)
remove.genes.ov = read.csv("overlaps_remove.csv",
header=TRUE, stringsAsFactors=F)
fix.genes.ov =read.csv("overlaps_keepadjacent.csv",
header=TRUE, stringsAsFactors=F)
genes.remove3 = c(genes.remove2, remove.genes.ov$remove,
fix.genes.ov$upstream, fix.genes.ov$downstream)
# read corrected start/end for 10 genes in bed file
TSS.gene.filtered2 = TSS.gene.filtered1[
!(TSS.gene.filtered1$gene %in% genes.remove3),]
# verify the abscence of overlaps
overlap.data = gene.overlaps( bed = TSS.gene.filtered2 )
overlap.data$cases
fix.genes.ov =read.csv("overlaps_keepadjacent.csv",
header=TRUE, stringsAsFactors=F)
# get the coordinates for adjacent gene pairs
fix.genes = rbind(fix.genes.id, fix.genes.ov)
bp.bin = 5
knot.div = 40
shift.up = 100
delta.tss = 50
diff.tss = 1000
dist.from.start = 50
adjacent.coords = adjacent.gene.coords(fix.genes=fix.genes, bed.long=TSS.gene,
exon1=gencode.firstExon,
bw.plus=bw.plus, bw.minus=bw.minus,
knot.div=knot.div, bp.bin=bp.bin,
shift.up=shift.up, delta.tss=delta.tss,
dist.from.start=dist.from.start,
diff.tss=diff.tss, fname="adjacentSplines.pdf")
# visualize coordinates for a specific pair
adjacent.coords.plot(adjacent.coords=adjacent.coords,
pair=fix.genes[1,],
bw.plus=bw.plus,
bw.minus=bw.minus)
# aggregate downstream adjacent genes with main data
TSS.gene.filtered3 = rbind(TSS.gene.filtered2, adjacent.coords)
overlap.data = gene.overlaps( bed = TSS.gene.filtered3 )
overlap.data$cases
#get intervals for TTS evaluation
add.to.end = 100000
fraction.end = 0.2
dist.from.start = 50
bed.for.tts.eval = get.end.intervals(bed=TSS.gene.filtered3,
add.to.end=add.to.end,
fraction.end=fraction.end,
dist.from.start=dist.from.start)
# distribution of clip distances
pdf("Histogram_TTS_clipdistance.pdf", useDingbats = FALSE, width=6.4, height=5.4)
hist(bed.for.tts.eval$xy, xlab="clip distance (bp)",col="black",main="")
dev.off()
# identify gene ends
add.to.end = max(bed.for.tts.eval$xy)
knot.div = 40
pk.thresh = 0.05
bp.bin = 50
knot.thresh = 5
cnt.thresh = 5
tau.dist = 50000
frac.max = 1
frac.min = 0.3
inferred.coords = get.TTS(bed=bed.for.tts.eval, tss=TSS.gene.filtered3,
bw.plus=bw.plus, bw.minus=bw.minus,
bp.bin=bp.bin, add.to.end=add.to.end,
pk.thresh=pk.thresh, knot.thresh=knot.thresh,
cnt.thresh=cnt.thresh, tau.dist=tau.dist,
frac.max=frac.max, frac.min=frac.min, knot.div=knot.div)
coords = inferred.coords$bed
# check for the percentage of identified TTSs that match the search region boundry
TTS.boundary.match(coords=coords, bed.for.tts.eval=bed.for.tts.eval) #0.1968006
# look at whether TTSs identified at the search boundry had clipped boundries
frac.bound.clip = TTS.boundary.clip(coords=coords, bed.for.tts.eval=bed.for.tts.eval)
frac.bound.clip$frac.clip #0.919607
frac.bound.clip$frac.noclip #0.08039303
# plot didtribution of clip distances for genes with boundary TTSs
pdf("bp_clipped_for_boundry.pdf", useDingbats = FALSE, width=6.4, height=5.4)
hist(frac.bound.clip$clip.tts,main="",col="black",
xlab="bp clipped for boundry genes")
dev.off()
# plot new coord id
gene.end = bed.for.tts.eval
# ploting NR3C1
long.gene = largest.interval.bed
pdf("Genenr3c1.pdf", onefile=FALSE)
tts.plot(coords=coords, gene.end=gene.end, long.gene=long.gene,
gene = "Nr3c1", xper=0.1, yper=0.2,
bw.plus=bw.plus, bw.minus=bw.minus, bp.bin=5,
frac.min=frac.min, frac.max=frac.max,
add.to.end=add.to.end, tau.dist=tau.dist)
dev.off()
# plot tts curve
pdf("Gene_curvenr3c1.pdf", onefile=FALSE)
gene.end.plot(bed=bed.for.tts.eval, gene="Nr3c1",
bw.plus=bw.plus, bw.minus=bw.minus,
bp.bin=bp.bin, add.to.end=add.to.end, knot.div=knot.div,
pk.thresh=pk.thresh, knot.thresh=knot.thresh,
cnt.thresh=cnt.thresh, tau.dist=tau.dist,
frac.max=frac.max, frac.min=frac.min)
dev.off()
write.table(coords,'primary_transcript_annotation.bed',sep='\t',quote=F,col.names=F,row.names=F)
After completing pTA, discard 6day timepoint from rest of analysis
github raw
rm -r 6day
mkdir 6day
mv *6d* 6day